import pandas as pd
import numpy as np
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import plotly.express as px
import pickle
from dateutil.parser import *
import warnings
warnings.filterwarnings('ignore')
/home/naru/.pyenv/versions/3.9.13/envs/py3.9.13/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
import os
# from datetime import datetime, timedelta
# def date_range(start, end):
# start = datetime.strptime(start, "%Y%m%d")
# end = datetime.strptime(end, "%Y%m%d")
# dates = [(start + timedelta(days=i)).strftime("%Y%m%d") for i in range((end-start).days+1)]
# return dates
# dates = date_range("20230322", "20230620")
# # print(dates)
# Load the raw TCS export for one day and preview it.
date = '20230322'
day_dir = f'TCS_TCS원시자료_1일_1일_{date}'
# NOTE: this changes the process-wide working directory; relative paths used
# afterwards are resolved against the day directory.
os.chdir(f'./data/{day_dir}')
# print(os.getcwd())
# print(os.listdir())
# os.listdir() without an argument lists the (new) working directory; the
# first entry it returns is taken as the CSV file name.
first_entry = os.listdir()[0]
origin_data = pd.read_csv(
    f'/mnt/c/Users/user/source/ai_plus/real_data/data/{day_dir}/{first_entry}',
    encoding='euc-kr',
)
origin_data
| 출구본부명 | 출구지사명 | 출구영업소코드 | 출구영업소명 | 처리일자 | 처리일시분초 | TCS차종구분코드 | TCS차종구분명 | 근무일자 | 근무번호 | 확인순번 | TCS본부명 | 지사명 | 영업소코드 | 영업소명 | 발급일시 | 발급시분초 | Unnamed: 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 강원본부 | 홍천 | 217 | 북원주 | 20230322 | 60406 | 1 | 1종 | 20230322 | 3601 | 7 | 강원본부 | 홍천 | 174 | 춘천 | 322 | 52400 | NaN |
| 1 | 대구경북본부 | 구미 | 126 | 왜관 | 20230322 | 60429 | 1 | 1종 | 20230322 | 3601 | 78 | 대구경북본부 | 대구 | 129 | 북대구 | 322 | 55200 | NaN |
| 2 | 대구경북본부 | 대구 | 135 | 경주 | 20230322 | 60438 | 4 | 4종 | 20230322 | 3501 | 35 | 대구경북본부 | 구미 | 126 | 왜관 | 20230322 | 44900 | NaN |
| 3 | 대구경북본부 | 구미 | 121 | 추풍령 | 20230322 | 60448 | 6 | 6종 | 20230322 | 3101 | 5 | 대구경북본부 | 구미 | 121 | 추풍령 | 321 | 195000 | NaN |
| 4 | 광주전남본부 | 순천 | 271 | 순천 | 20230322 | 60505 | 6 | 6종 | 20230322 | 3801 | 52 | 광주전남본부 | 담양 | 569 | 북광주 | 322 | 51300 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 403017 | 제2서해안고속도로 | 평택시흥 | 683 | 송산마도 | 20230322 | 75729 | 1 | 1종 | 20230322 | 202 | 404 | 제2서해안고속도로 | 평택시흥 | 685 | 서시흥 | 322 | 72900 | NaN |
| 403018 | 제2서해안고속도로 | 평택시흥 | 683 | 송산마도 | 20230322 | 75738 | 1 | 1종 | 20230322 | 202 | 405 | 제2서해안고속도로 | 평택시흥 | 685 | 서시흥 | 322 | 73400 | NaN |
| 403019 | 제2서해안고속도로 | 평택시흥 | 683 | 송산마도 | 20230322 | 75755 | 1 | 1종 | 20230322 | 202 | 406 | 제2서해안고속도로 | 평택시흥 | 685 | 서시흥 | 322 | 72900 | NaN |
| 403020 | 광주전남본부 | 순천 | 272 | 광양 | 20230322 | 53826 | 5 | 5종 | 20230322 | 3801 | 30 | 부산경남본부 | 창원 | 245 | 장유 | 322 | 40300 | NaN |
| 403021 | 수도권본부 | 화성 | 283 | 발안 | 20230322 | 53830 | 1 | 1종 | 20230322 | 3701 | 46 | 수도권본부 | 군포 | 254 | 군자 | 322 | 51400 | NaN |
403022 rows × 18 columns
# Preview the exit branch / exit office names next to the raw issue date and
# issue time fields.
data = origin_data.loc[:, ['출구지사명', '출구영업소명', '발급일시', '발급시분초']]
data
| 출구지사명 | 출구영업소명 | 발급일시 | 발급시분초 | |
|---|---|---|---|---|
| 0 | 홍천 | 북원주 | 322 | 52400 |
| 1 | 구미 | 왜관 | 322 | 55200 |
| 2 | 대구 | 경주 | 20230322 | 44900 |
| 3 | 구미 | 추풍령 | 321 | 195000 |
| 4 | 순천 | 순천 | 322 | 51300 |
| ... | ... | ... | ... | ... |
| 403017 | 평택시흥 | 송산마도 | 322 | 72900 |
| 403018 | 평택시흥 | 송산마도 | 322 | 73400 |
| 403019 | 평택시흥 | 송산마도 | 322 | 72900 |
| 403020 | 순천 | 광양 | 322 | 40300 |
| 403021 | 화성 | 발안 | 322 | 51400 |
403022 rows × 4 columns
# Keep only the exit office name and the processing date/time fields.
# .copy() detaches the selection from origin_data so the column assignments
# made on `data` afterwards write to an independent frame instead of a slice
# (which is what triggers pandas' SettingWithCopyWarning).
data = origin_data[['출구영업소명', '처리일자', '처리일시분초']].copy()
data
| 출구영업소명 | 처리일자 | 처리일시분초 | |
|---|---|---|---|
| 0 | 북원주 | 20230322 | 60406 |
| 1 | 왜관 | 20230322 | 60429 |
| 2 | 경주 | 20230322 | 60438 |
| 3 | 추풍령 | 20230322 | 60448 |
| 4 | 순천 | 20230322 | 60505 |
| ... | ... | ... | ... |
| 403017 | 송산마도 | 20230322 | 75729 |
| 403018 | 송산마도 | 20230322 | 75738 |
| 403019 | 송산마도 | 20230322 | 75755 |
| 403020 | 광양 | 20230322 | 53826 |
| 403021 | 발안 | 20230322 | 53830 |
403022 rows × 3 columns
# Cast the processing date and time fields to strings so they can be padded
# and joined as text, then print the resulting dtypes.
for column in ('처리일자', '처리일시분초'):
    data[column] = data[column].astype(str)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 403022 entries, 0 to 403021 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 출구영업소명 403022 non-null object 1 처리일자 403022 non-null object 2 처리일시분초 403022 non-null object dtypes: object(3) memory usage: 9.2+ MB
# The time field was read as a number, so leading zeros are missing
# (e.g. 60406); left-pad to the fixed 6-character HHMMSS width.
# The vectorised .str accessor replaces a Python-level lambda per row.
data['처리일시분초'] = data['처리일시분초'].str.zfill(6)
data
| 출구영업소명 | 처리일자 | 처리일시분초 | |
|---|---|---|---|
| 0 | 북원주 | 20230322 | 060406 |
| 1 | 왜관 | 20230322 | 060429 |
| 2 | 경주 | 20230322 | 060438 |
| 3 | 추풍령 | 20230322 | 060448 |
| 4 | 순천 | 20230322 | 060505 |
| ... | ... | ... | ... |
| 403017 | 송산마도 | 20230322 | 075729 |
| 403018 | 송산마도 | 20230322 | 075738 |
| 403019 | 송산마도 | 20230322 | 075755 |
| 403020 | 광양 | 20230322 | 053826 |
| 403021 | 발안 | 20230322 | 053830 |
403022 rows × 3 columns
# Build one row per transaction with columns user / ds / y
# (ds/y naming presumably targets Prophet, which is imported above -- confirm).
#
# The timestamp is assembled with vectorised string concatenation and parsed
# with an explicit format: this avoids a row-wise Python apply and per-value
# format inference on several hundred thousand rows.
data['ds'] = pd.to_datetime(
    data['처리일자'] + ' ' + data['처리일시분초'],
    format='%Y%m%d %H%M%S',
)
# Rename by label rather than by position so a column-order change cannot
# silently mislabel the data.
data = data.rename(columns={'출구영업소명': 'user'})
# .copy() so adding 'y' writes to an independent frame rather than a slice.
data = data[['user', 'ds']].copy()
data['y'] = 1  # each row counts as a single event
data
| user | ds | y | |
|---|---|---|---|
| 0 | 북원주 | 2023-03-22 06:04:06 | 1 |
| 1 | 왜관 | 2023-03-22 06:04:29 | 1 |
| 2 | 경주 | 2023-03-22 06:04:38 | 1 |
| 3 | 추풍령 | 2023-03-22 06:04:48 | 1 |
| 4 | 순천 | 2023-03-22 06:05:05 | 1 |
| ... | ... | ... | ... |
| 403017 | 송산마도 | 2023-03-22 07:57:29 | 1 |
| 403018 | 송산마도 | 2023-03-22 07:57:38 | 1 |
| 403019 | 송산마도 | 2023-03-22 07:57:55 | 1 |
| 403020 | 광양 | 2023-03-22 05:38:26 | 1 |
| 403021 | 발안 | 2023-03-22 05:38:30 | 1 |
403022 rows × 3 columns
# Rank exit offices by number of rows and keep the ten largest.
top10user = (
    data.groupby(by='user')
    .count()
    .sort_values(by='ds', ascending=False)
    .head(10)
)
# Office names of those ten, in ranking order, as a NumPy array.
users = top10user.reset_index()['user'].unique()
users
array(['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안'],
dtype=object)
# For every selected office, collect its (ds, y) rows ordered by time with a
# fresh 0..n-1 index, then map office name -> frame.
user_dfs = []
for user in users:
    per_user = data[data.user == user][['ds', 'y']]
    user_dfs.append(per_user.sort_values('ds').reset_index(drop=True))
users_dict = dict(zip(users, user_dfs))
users_dict.keys()
dict_keys(['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안'])
# users_dict['서서울']
# Count transactions per one-minute bin for the '서서울' office.
# '1min' is used instead of '1T': the 'T' alias is deprecated in recent pandas
# releases, while 'min' is accepted by old and new versions alike.
temp = users_dict['서서울'].set_index('ds').resample('1min').count().reset_index()
temp
| ds | y | |
|---|---|---|
| 0 | 2023-03-22 00:01:00 | 2 |
| 1 | 2023-03-22 00:02:00 | 1 |
| 2 | 2023-03-22 00:03:00 | 1 |
| 3 | 2023-03-22 00:04:00 | 2 |
| 4 | 2023-03-22 00:05:00 | 2 |
| ... | ... | ... |
| 1434 | 2023-03-22 23:55:00 | 4 |
| 1435 | 2023-03-22 23:56:00 | 3 |
| 1436 | 2023-03-22 23:57:00 | 1 |
| 1437 | 2023-03-22 23:58:00 | 1 |
| 1438 | 2023-03-22 23:59:00 | 2 |
1439 rows × 2 columns
# px.line( temp, x='ds', y='y', title='Train data' ).show()
# Scatter plot of the per-minute counts.
train_scatter = px.scatter(temp, x='ds', y='y', title='Train data')
train_scatter.show()
# user_dfs = [ data[data.user == user][['ds','y']].sort_values('ds').reset_index( drop=True) for user in users ]
# Hard-coded list of the ten exit offices selected for analysis.
users = [
    '서서울', '서울', '대동', '동서울', '군자',
    '북평택', '북대구', '서시흥', '서대구', '천안',
]
# origin_data
# Row labels of every record whose exit office is '서서울'.
is_seoseoul = origin_data['출구영업소명'] == '서서울'
origin_data[is_seoseoul].index
7843
# indexes = [ list(origin_data[origin_data['출구영업소명'] == user].index) for user in users ]
# indexes = sum( indexes, [] )
# Flat list of row labels for all selected offices, grouped office by office
# in the order given by `users`.
indexes = [
    label
    for user in users
    for label in origin_data[origin_data['출구영업소명'] == user].index
]
# Sanity check: the selected rows contain exactly the requested offices.
origin_data.loc[indexes]['출구영업소명'].unique()
array(['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안'],
dtype=object)
# Subset of the raw data restricted to the selected offices; compare its size
# with the full frame.
top10_data = origin_data.loc[indexes]
(top10_data.shape, origin_data.shape)
((52926, 18), (403022, 18))
# Write the selected-office subset to a sample CSV. The path is relative, so
# it resolves against the current working directory (changed by os.chdir
# earlier in this notebook).
top10_data.to_csv('../../sample/sample.csv', header=True )
import os

# One file path per daily directory under ./data (the first entry found in
# each directory).
# os.listdir() returns entries in arbitrary order, so the directory names are
# sorted to make the list deterministic -- and chronological, since the names
# end in a YYYYMMDD date.
# The loop variable is named day_dir to avoid shadowing the builtin dir().
file_list = [
    '/'.join(('./data', day_dir, os.listdir(f'./data/{day_dir}')[0]))
    for day_dir in sorted(os.listdir('./data'))
]
print(len(file_list))
file_list
91
['./data/TCS_TCS원시자료_1일_1일_20230322/TCS_17_04_01_572191.csv', './data/TCS_TCS원시자료_1일_1일_20230323/TCS_17_04_01_594836.csv', './data/TCS_TCS원시자료_1일_1일_20230324/TCS_17_04_01_511825.csv', './data/TCS_TCS원시자료_1일_1일_20230325/TCS_17_04_01_454206.csv', './data/TCS_TCS원시자료_1일_1일_20230326/TCS_17_04_01_997192.csv', './data/TCS_TCS원시자료_1일_1일_20230327/TCS_17_04_01_459303.csv', './data/TCS_TCS원시자료_1일_1일_20230328/TCS_17_04_01_479567.csv', './data/TCS_TCS원시자료_1일_1일_20230329/TCS_17_04_01_698049.csv', './data/TCS_TCS원시자료_1일_1일_20230330/TCS_17_04_01_102969.csv', './data/TCS_TCS원시자료_1일_1일_20230331/TCS_17_04_01_328531.csv', './data/TCS_TCS원시자료_1일_1일_20230401/TCS_17_04_01_585985.csv', './data/TCS_TCS원시자료_1일_1일_20230402/TCS_17_04_01_314188.csv', './data/TCS_TCS원시자료_1일_1일_20230403/TCS_17_04_01_454817.csv', './data/TCS_TCS원시자료_1일_1일_20230404/TCS_17_04_01_208960.csv', './data/TCS_TCS원시자료_1일_1일_20230405/TCS_17_04_01_745475.csv', './data/TCS_TCS원시자료_1일_1일_20230406/TCS_17_04_01_373973.csv', './data/TCS_TCS원시자료_1일_1일_20230407/TCS_17_04_01_793664.csv', './data/TCS_TCS원시자료_1일_1일_20230408/TCS_17_04_01_936033.csv', './data/TCS_TCS원시자료_1일_1일_20230409/TCS_17_04_01_246528.csv', './data/TCS_TCS원시자료_1일_1일_20230410/TCS_17_04_01_374187.csv', './data/TCS_TCS원시자료_1일_1일_20230411/TCS_17_04_01_627491.csv', './data/TCS_TCS원시자료_1일_1일_20230412/TCS_17_04_01_980498.csv', './data/TCS_TCS원시자료_1일_1일_20230413/TCS_17_04_01_181829.csv', './data/TCS_TCS원시자료_1일_1일_20230414/TCS_17_04_01_537217.csv', './data/TCS_TCS원시자료_1일_1일_20230415/TCS_17_04_01_838312.csv', './data/TCS_TCS원시자료_1일_1일_20230416/TCS_17_04_01_208410.csv', './data/TCS_TCS원시자료_1일_1일_20230417/TCS_17_04_01_294167.csv', './data/TCS_TCS원시자료_1일_1일_20230418/TCS_17_04_01_178106.csv', './data/TCS_TCS원시자료_1일_1일_20230419/TCS_17_04_01_592455.csv', './data/TCS_TCS원시자료_1일_1일_20230420/TCS_17_04_01_690603.csv', './data/TCS_TCS원시자료_1일_1일_20230421/TCS_17_04_01_553941.csv', './data/TCS_TCS원시자료_1일_1일_20230422/TCS_17_04_01_698080.csv', 
'./data/TCS_TCS원시자료_1일_1일_20230423/TCS_17_04_01_426892.csv', './data/TCS_TCS원시자료_1일_1일_20230424/TCS_17_04_01_539780.csv', './data/TCS_TCS원시자료_1일_1일_20230425/TCS_17_04_01_210760.csv', './data/TCS_TCS원시자료_1일_1일_20230426/TCS_17_04_01_113010.csv', './data/TCS_TCS원시자료_1일_1일_20230427/TCS_17_04_01_355113.csv', './data/TCS_TCS원시자료_1일_1일_20230428/TCS_17_04_01_569475.csv', './data/TCS_TCS원시자료_1일_1일_20230429/TCS_17_04_01_168462.csv', './data/TCS_TCS원시자료_1일_1일_20230430/TCS_17_04_01_438306.csv', './data/TCS_TCS원시자료_1일_1일_20230501/TCS_17_04_01_549424.csv', './data/TCS_TCS원시자료_1일_1일_20230502/TCS_17_04_01_511276.csv', './data/TCS_TCS원시자료_1일_1일_20230503/TCS_17_04_01_906155.csv', './data/TCS_TCS원시자료_1일_1일_20230504/TCS_17_04_01_845606.csv', './data/TCS_TCS원시자료_1일_1일_20230505/TCS_17_04_01_645771.csv', './data/TCS_TCS원시자료_1일_1일_20230506/TCS_17_04_01_810602.csv', './data/TCS_TCS원시자료_1일_1일_20230507/TCS_17_04_01_625537.csv', './data/TCS_TCS원시자료_1일_1일_20230508/TCS_17_04_01_975005.csv', './data/TCS_TCS원시자료_1일_1일_20230509/TCS_17_04_01_836542.csv', './data/TCS_TCS원시자료_1일_1일_20230510/TCS_17_04_01_448530.csv', './data/TCS_TCS원시자료_1일_1일_20230511/TCS_17_04_01_271156.csv', './data/TCS_TCS원시자료_1일_1일_20230512/TCS_17_04_01_125034.csv', './data/TCS_TCS원시자료_1일_1일_20230513/TCS_17_04_01_847956.csv', './data/TCS_TCS원시자료_1일_1일_20230514/TCS_17_04_01_145481.csv', './data/TCS_TCS원시자료_1일_1일_20230515/TCS_17_04_01_881374.csv', './data/TCS_TCS원시자료_1일_1일_20230516/TCS_17_04_01_539384.csv', './data/TCS_TCS원시자료_1일_1일_20230517/TCS_17_04_01_154911.csv', './data/TCS_TCS원시자료_1일_1일_20230518/TCS_17_04_01_990539.csv', './data/TCS_TCS원시자료_1일_1일_20230519/TCS_17_04_01_988952.csv', './data/TCS_TCS원시자료_1일_1일_20230520/TCS_17_04_01_104770.csv', './data/TCS_TCS원시자료_1일_1일_20230521/TCS_17_04_01_605243.csv', './data/TCS_TCS원시자료_1일_1일_20230522/TCS_17_04_01_273293.csv', './data/TCS_TCS원시자료_1일_1일_20230523/TCS_17_04_01_454634.csv', './data/TCS_TCS원시자료_1일_1일_20230524/TCS_17_04_01_134861.csv', 
'./data/TCS_TCS원시자료_1일_1일_20230525/TCS_17_04_01_787377.csv', './data/TCS_TCS원시자료_1일_1일_20230526/TCS_17_04_01_926023.csv', './data/TCS_TCS원시자료_1일_1일_20230527/TCS_17_04_01_606250.csv', './data/TCS_TCS원시자료_1일_1일_20230528/TCS_17_04_01_772637.csv', './data/TCS_TCS원시자료_1일_1일_20230529/TCS_17_04_01_911282.csv', './data/TCS_TCS원시자료_1일_1일_20230530/TCS_17_04_01_563768.csv', './data/TCS_TCS원시자료_1일_1일_20230531/TCS_17_04_01_188543.csv', './data/TCS_TCS원시자료_1일_1일_20230601/TCS_17_04_01_299447.csv', './data/TCS_TCS원시자료_1일_1일_20230602/TCS_17_04_01_465834.csv', './data/TCS_TCS원시자료_1일_1일_20230603/TCS_17_04_01_632221.csv', './data/TCS_TCS원시자료_1일_1일_20230604/TCS_17_04_01_798608.csv', './data/TCS_TCS원시자료_1일_1일_20230605/TCS_17_04_01_937253.csv', './data/TCS_TCS원시자료_1일_1일_20230606/TCS_17_04_01_103579.csv', './data/TCS_TCS원시자료_1일_1일_20230607/TCS_17_04_01_214514.csv', './data/TCS_TCS원시자료_1일_1일_20230608/TCS_17_04_01_353160.csv', './data/TCS_TCS원시자료_1일_1일_20230609/TCS_17_04_01_491805.csv', './data/TCS_TCS원시자료_1일_1일_20230610/TCS_17_04_01_630451.csv', './data/TCS_TCS원시자료_1일_1일_20230611/TCS_17_04_01_769096.csv', './data/TCS_TCS원시자료_1일_1일_20230612/TCS_17_04_01_421582.csv', './data/TCS_TCS원시자료_1일_1일_20230613/TCS_17_04_01_532517.csv', './data/TCS_TCS원시자료_1일_1일_20230614/TCS_17_04_01_185003.csv', './data/TCS_TCS원시자료_1일_1일_20230615/TCS_17_04_01_295937.csv', './data/TCS_TCS원시자료_1일_1일_20230616/TCS_17_04_01_920712.csv', './data/TCS_TCS원시자료_1일_1일_20230617/TCS_17_04_01_545487.csv', './data/TCS_TCS원시자료_1일_1일_20230618/TCS_17_04_01_656422.csv', './data/TCS_TCS원시자료_1일_1일_20230619/TCS_17_04_01_767326.csv', './data/TCS_TCS원시자료_1일_1일_20230620/TCS_17_04_01_460341.csv']
import os
import pandas as pd

# One file path per daily directory under ./data, in sorted (chronological)
# directory order; os.listdir() alone gives no ordering guarantee.
file_list = [
    '/'.join(('./data', day_dir, os.listdir(f'./data/{day_dir}')[0]))
    for day_dir in sorted(os.listdir('./data'))
]
users = ['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안']

# For every daily file keep only the rows of the selected exit offices.
picking_data_files = []
for file in file_list:
    print(file, 'start~')
    df = pd.read_csv(file, encoding='euc-kr')
    # Row labels grouped office by office (same order as `users`); a nested
    # comprehension flattens them without repeated list concatenation.
    indexes = [
        label
        for user in users
        for label in df[df['출구영업소명'] == user].index
    ]
    picking_data_files.append(df.loc[indexes])
./data/TCS_TCS원시자료_1일_1일_20230322/TCS_17_04_01_572191.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230323/TCS_17_04_01_594836.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230324/TCS_17_04_01_511825.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230325/TCS_17_04_01_454206.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230326/TCS_17_04_01_997192.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230327/TCS_17_04_01_459303.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230328/TCS_17_04_01_479567.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230329/TCS_17_04_01_698049.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230330/TCS_17_04_01_102969.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230331/TCS_17_04_01_328531.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230401/TCS_17_04_01_585985.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230402/TCS_17_04_01_314188.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230403/TCS_17_04_01_454817.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230404/TCS_17_04_01_208960.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230405/TCS_17_04_01_745475.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230406/TCS_17_04_01_373973.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230407/TCS_17_04_01_793664.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230408/TCS_17_04_01_936033.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230409/TCS_17_04_01_246528.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230410/TCS_17_04_01_374187.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230411/TCS_17_04_01_627491.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230412/TCS_17_04_01_980498.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230413/TCS_17_04_01_181829.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230414/TCS_17_04_01_537217.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230415/TCS_17_04_01_838312.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230416/TCS_17_04_01_208410.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230417/TCS_17_04_01_294167.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230418/TCS_17_04_01_178106.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230419/TCS_17_04_01_592455.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230420/TCS_17_04_01_690603.csv start~ 
./data/TCS_TCS원시자료_1일_1일_20230421/TCS_17_04_01_553941.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230422/TCS_17_04_01_698080.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230423/TCS_17_04_01_426892.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230424/TCS_17_04_01_539780.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230425/TCS_17_04_01_210760.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230426/TCS_17_04_01_113010.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230427/TCS_17_04_01_355113.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230428/TCS_17_04_01_569475.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230429/TCS_17_04_01_168462.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230430/TCS_17_04_01_438306.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230501/TCS_17_04_01_549424.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230502/TCS_17_04_01_511276.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230503/TCS_17_04_01_906155.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230504/TCS_17_04_01_845606.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230505/TCS_17_04_01_645771.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230506/TCS_17_04_01_810602.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230507/TCS_17_04_01_625537.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230508/TCS_17_04_01_975005.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230509/TCS_17_04_01_836542.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230510/TCS_17_04_01_448530.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230511/TCS_17_04_01_271156.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230512/TCS_17_04_01_125034.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230513/TCS_17_04_01_847956.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230514/TCS_17_04_01_145481.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230515/TCS_17_04_01_881374.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230516/TCS_17_04_01_539384.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230517/TCS_17_04_01_154911.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230518/TCS_17_04_01_990539.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230519/TCS_17_04_01_988952.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230520/TCS_17_04_01_104770.csv start~ 
./data/TCS_TCS원시자료_1일_1일_20230521/TCS_17_04_01_605243.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230522/TCS_17_04_01_273293.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230523/TCS_17_04_01_454634.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230524/TCS_17_04_01_134861.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230525/TCS_17_04_01_787377.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230526/TCS_17_04_01_926023.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230527/TCS_17_04_01_606250.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230528/TCS_17_04_01_772637.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230529/TCS_17_04_01_911282.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230530/TCS_17_04_01_563768.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230531/TCS_17_04_01_188543.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230601/TCS_17_04_01_299447.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230602/TCS_17_04_01_465834.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230603/TCS_17_04_01_632221.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230604/TCS_17_04_01_798608.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230605/TCS_17_04_01_937253.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230606/TCS_17_04_01_103579.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230607/TCS_17_04_01_214514.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230608/TCS_17_04_01_353160.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230609/TCS_17_04_01_491805.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230610/TCS_17_04_01_630451.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230611/TCS_17_04_01_769096.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230612/TCS_17_04_01_421582.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230613/TCS_17_04_01_532517.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230614/TCS_17_04_01_185003.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230615/TCS_17_04_01_295937.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230616/TCS_17_04_01_920712.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230617/TCS_17_04_01_545487.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230618/TCS_17_04_01_656422.csv start~ ./data/TCS_TCS원시자료_1일_1일_20230619/TCS_17_04_01_767326.csv start~ 
./data/TCS_TCS원시자료_1일_1일_20230620/TCS_17_04_01_460341.csv start~
# import pickle
# with open( './sample/sample.pickle', 'wb') as fwb:
# pickle.dump( picking_data_files, fwb, protocol=pickle.HIGHEST_PROTOCOL )
# Stack the per-day subsets row-wise (the default axis) into a single frame;
# original row labels are kept, so labels repeat across days.
full_picking_data = pd.concat(picking_data_files)
full_picking_data
| 출구본부명 | 출구지사명 | 출구영업소코드 | 출구영업소명 | 처리일자 | 처리일시분초 | TCS차종구분코드 | TCS차종구분명 | 근무일자 | 근무번호 | 확인순번 | TCS본부명 | 지사명 | 영업소코드 | 영업소명 | 발급일시 | 발급시분초 | Unnamed: 17 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 104 | 수도권본부 | 시흥 | 253 | 서서울 | 20230322 | 74831 | 6 | 6종 | 20230322 | 4602 | 117 | 수도권본부 | 군포 | 256 | 안산 | 322 | 74100 | NaN |
| 153 | 수도권본부 | 시흥 | 253 | 서서울 | 20230322 | 103844 | 1 | 1종 | 20230322 | 4001 | 405 | 대전충남본부 | 당진 | 288 | 당진 | 20230322 | 95800 | NaN |
| 423 | 수도권본부 | 시흥 | 253 | 서서울 | 20230322 | 84252 | 1 | 1종 | 20230322 | 4001 | 200 | 수도권본부 | 군포 | 102 | 동수원 | 20230322 | 82600 | NaN |
| 435 | 수도권본부 | 시흥 | 253 | 서서울 | 20230322 | 84806 | 6 | 6종 | 20230322 | 4602 | 174 | 수도권본부 | 화성 | 282 | 비봉 | 322 | 83600 | NaN |
| 452 | 수도권본부 | 시흥 | 253 | 서서울 | 20230322 | 85331 | 4 | 4종 | 20230322 | 3801 | 172 | 강원본부 | 이천 | 173 | 양지 | 322 | 80100 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 352065 | 대전충남본부 | 천안 | 108 | 천안 | 20230620 | 133405 | 1 | 1종 | 20230620 | 3501 | 542 | 수도권본부 | 수원 | 103 | 수원신갈 | 20230620 | 125000 | NaN |
| 352066 | 대전충남본부 | 천안 | 108 | 천안 | 20230620 | 133423 | 1 | 1종 | 20230620 | 3501 | 543 | 수도권본부 | 수원 | 742 | 남사진위 | 620 | 123600 | NaN |
| 352067 | 대전충남본부 | 천안 | 108 | 천안 | 20230620 | 133611 | 1 | 1종 | 20230620 | 3501 | 544 | 수도권본부 | 군포 | 102 | 동수원 | 620 | 125800 | NaN |
| 352068 | 대전충남본부 | 천안 | 108 | 천안 | 20230620 | 133633 | 1 | 1종 | 20230620 | 3501 | 545 | 수도권본부 | 군포 | 254 | 군자 | 620 | 114400 | NaN |
| 352069 | 대전충남본부 | 천안 | 108 | 천안 | 20230620 | 133716 | 1 | 1종 | 20230620 | 3501 | 546 | 대전충남본부 | 천안 | 107 | 안성 | 20230620 | 130000 | NaN |
4846621 rows × 18 columns
# (rows, columns) of the combined frame.
full_picking_data.shape
(4846621, 18)
# Column labels of the combined frame.
full_picking_data.columns
Index(['출구본부명', '출구지사명', '출구영업소코드', '출구영업소명', '처리일자', '처리일시분초', 'TCS차종구분코드',
'TCS차종구분명', '근무일자', '근무번호', '확인순번', 'TCS본부명', '지사명', '영업소코드', '영업소명',
'발급일시', '발급시분초', 'Unnamed: 17'],
dtype='object')
# Preview the exit office name with the processing date and time fields.
full_picking_data[[ '출구영업소명', '처리일자', '처리일시분초']]
| 출구영업소명 | 처리일자 | 처리일시분초 | |
|---|---|---|---|
| 104 | 서서울 | 20230322 | 074831 |
| 153 | 서서울 | 20230322 | 103844 |
| 423 | 서서울 | 20230322 | 084252 |
| 435 | 서서울 | 20230322 | 084806 |
| 452 | 서서울 | 20230322 | 085331 |
| ... | ... | ... | ... |
| 352065 | 천안 | 20230620 | 133405 |
| 352066 | 천안 | 20230620 | 133423 |
| 352067 | 천안 | 20230620 | 133611 |
| 352068 | 천안 | 20230620 | 133633 |
| 352069 | 천안 | 20230620 | 133716 |
4846621 rows × 3 columns
# Number of distinct processing dates present (missing values, if any, count
# as one distinct value, matching len(unique())).
full_picking_data['처리일자'].nunique(dropna=False)
91
# Turn the processing date and time fields into a single datetime column 'ds'.
# The time field is left-padded to the fixed 6-character HHMMSS width because
# numeric parsing drops leading zeros.
full_picking_data['처리일자'] = full_picking_data['처리일자'].astype(str)
full_picking_data['처리일시분초'] = (
    full_picking_data['처리일시분초'].astype(str).str.zfill(6)
)
# Vectorised concatenation plus an explicit format replaces a row-wise Python
# apply and per-value format inference over millions of rows.
full_picking_data['ds'] = pd.to_datetime(
    full_picking_data['처리일자'] + ' ' + full_picking_data['처리일시분초'],
    format='%Y%m%d %H%M%S',
)
full_picking_data[['출구영업소명', 'ds']]
| 출구영업소명 | ds | |
|---|---|---|
| 104 | 서서울 | 2023-03-22 07:48:31 |
| 153 | 서서울 | 2023-03-22 10:38:44 |
| 423 | 서서울 | 2023-03-22 08:42:52 |
| 435 | 서서울 | 2023-03-22 08:48:06 |
| 452 | 서서울 | 2023-03-22 08:53:31 |
| ... | ... | ... |
| 352065 | 천안 | 2023-06-20 13:34:05 |
| 352066 | 천안 | 2023-06-20 13:34:23 |
| 352067 | 천안 | 2023-06-20 13:36:11 |
| 352068 | 천안 | 2023-06-20 13:36:33 |
| 352069 | 천안 | 2023-06-20 13:37:16 |
4846621 rows × 2 columns
# Compact event table: one row per transaction with columns user / ds / y,
# ordered by timestamp with a fresh 0..n-1 index.
# Built as a chain of non-mutating steps so nothing is assigned to, or sorted
# in place on, a slice of full_picking_data (the cause of pandas'
# SettingWithCopyWarning).
compat_data = (
    full_picking_data[['출구영업소명', 'ds']]
    .rename(columns={'출구영업소명': 'user'})
    .assign(y=1)  # each row counts as a single event
    .sort_values('ds')
    .reset_index(drop=True)
)
compat_data
/tmp/ipykernel_13028/174773155.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
compat_data['y'] = 1
/tmp/ipykernel_13028/174773155.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
compat_data.sort_values('ds', inplace=True)
| user | ds | y | |
|---|---|---|---|
| 0 | 북대구 | 2023-03-22 00:00:06 | 1 |
| 1 | 서대구 | 2023-03-22 00:00:10 | 1 |
| 2 | 동서울 | 2023-03-22 00:00:15 | 1 |
| 3 | 군자 | 2023-03-22 00:00:19 | 1 |
| 4 | 서울 | 2023-03-22 00:00:21 | 1 |
| ... | ... | ... | ... |
| 4846616 | 서울 | 2023-06-20 23:59:43 | 1 |
| 4846617 | 서서울 | 2023-06-20 23:59:45 | 1 |
| 4846618 | 동서울 | 2023-06-20 23:59:45 | 1 |
| 4846619 | 군자 | 2023-06-20 23:59:54 | 1 |
| 4846620 | 동서울 | 2023-06-20 23:59:54 | 1 |
4846621 rows × 3 columns
# import pickle
# with open( './sample/top10users.pickle', 'wb') as fwb:
# pickle.dump( compat_data, fwb, protocol=pickle.HIGHEST_PROTOCOL )
import pickle

# Reload the compact event table from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open('./sample/top10users.pickle', 'rb') as frb:
    compat_data = pickle.load(frb)

# Time-ordered (ds, y) rows of the '서서울' office with a fresh index.
is_sample_office = compat_data.user == '서서울'
smp_data = (
    compat_data[is_sample_office][['ds', 'y']]
    .sort_values('ds')
    .reset_index(drop=True)
)
smp_data
| ds | y | |
|---|---|---|
| 0 | 2023-03-22 00:01:25 | 1 |
| 1 | 2023-03-22 00:01:42 | 1 |
| 2 | 2023-03-22 00:02:04 | 1 |
| 3 | 2023-03-22 00:03:39 | 1 |
| 4 | 2023-03-22 00:04:33 | 1 |
| ... | ... | ... |
| 739969 | 2023-06-20 23:58:27 | 1 |
| 739970 | 2023-06-20 23:58:51 | 1 |
| 739971 | 2023-06-20 23:59:03 | 1 |
| 739972 | 2023-06-20 23:59:13 | 1 |
| 739973 | 2023-06-20 23:59:45 | 1 |
739974 rows × 2 columns
users = ['서서울', '서울', '대동', '동서울', '군자', '북평택', '북대구', '서시흥', '서대구', '천안']

# For each office, print the largest number of transactions seen in any
# single one-minute bin.
for idx, user in enumerate(users):
    per_minute = (
        compat_data[compat_data.user == user][['ds', 'y']]
        .sort_values('ds')
        .set_index('ds')
        .resample('1min')  # 'min' replaces the deprecated 'T' alias
        .count()
    )
    # Computed outside the f-string so the report line stays readable.
    peak = per_minute['y'].max()
    print(f'\nuser : {idx} - {user}\nmax : {peak}\n')
# compat_data[compat_data.user == user][['ds','y']].sort_values('ds').set_index('ds').resample( '1T' ).count().reset_index()['y'].max()
user : 0 - 서서울
max : 20
user : 1 - 서울
max : 19
user : 2 - 대동
max : 18
user : 3 - 동서울
max : 15
user : 4 - 군자
max : 14
user : 5 - 북평택
max : 15
user : 6 - 북대구
max : 14
user : 7 - 서시흥
max : 11
user : 8 - 서대구
max : 14
user : 9 - 천안
max : 13
# Per-minute transaction counts for the sample office, drawn as a line chart
# and as a scatter chart.
# '1min' is used instead of the deprecated '1T' frequency alias.
temp_data = smp_data[['ds', 'y']].set_index('ds').resample('1min').count().reset_index()
px.line(temp_data, x='ds', y='y', title='Train data').show()
px.scatter(temp_data, x='ds', y='y', title='Train data').show()